{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pandas 12 Utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import scipy.stats as stat\n",
    "\n",
    "data = pd.read_csv('./res/train.csv', index_col='Loan_ID')\n",
    "\n",
    "data[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. boolean index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "conds = (data['Gender']=='Female') & (data['Education']=='Not Graduate') & (data['Loan_Status'] == 'Y')\n",
    "print(conds[:10])\n",
    "data.loc[conds, ['Gender', 'Education', 'Loan_Status']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. apply function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def num_null(x):\n",
    "    return sum(pd.isnull(x))\n",
    "\n",
    "print('column null count:')\n",
    "print(data.apply(func=num_null, axis=0))\n",
    "print('\\nrow null count:')\n",
    "print(data.apply(func=num_null, axis=1).head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. fill null value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1 = data['Gender']\n",
    "g = s1.mode()\n",
    "print(\"type:\", type(g), \" col0:\", g[0])\n",
    "# stat.mode(s1.dropna())\n",
    "data['Gender'].fillna(g[0], inplace=True)\n",
    "data['Married'].fillna(data['Married'].mode()[0], inplace=True)\n",
    "data['Self_Employed'].fillna(data['Self_Employed'].mode()[0], inplace=True)\n",
    "\n",
    "print(sum(data['Gender'].isnull()), sum(data['Married'].isnull()), sum(data['Self_Employed'].isnull()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. pivot table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2 x 2 x 2\n",
    "v1 = data.pivot_table(values=['LoanAmount'], index=['Gender', 'Married', 'Self_Employed'], aggfunc=np.mean)\n",
    "print(v1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. composite index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print(sum(data['LoanAmount'].isnull()))\n",
    "for i, row in data.loc[data['LoanAmount'].isnull(), :].iterrows():\n",
    "    t = tuple([row['Gender'], row['Married'], row['Self_Employed']])\n",
    "    data.loc[i, 'LoanAmount'] = v1.loc[t].values[0]\n",
    " \n",
    "print(\"Again check loanAmount na count:\")\n",
    "data.apply(func=num_null, axis=0)\n",
    "data.apply()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. crosstab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "ct = pd.crosstab(index=data['Credit_History'], columns=data['Loan_Status'], margins=True)\n",
    "print(ct)\n",
    "\n",
    "def colums_per_row_cal(row):\n",
    "    return row / float(row[-1])\n",
    "\n",
    "ct = pd.crosstab(data['Credit_History'], data['Loan_Status'], margins=True).apply(func=colums_per_row_cal, axis=1)\n",
    "ct"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. merge dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prop_rates = pd.DataFrame(data=[1000, 5000, 12000], index=['Rural','Semiurban','Urban'],columns=['rates'])\n",
    "prop_rates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "merge_data = data.merge(right=prop_rates, how='inner', left_on='Property_Area', right_index=True, sort=False)\n",
    "# merge_data[:5].loc[:,['Credit_History', 'Property_Area', 'rates']]\n",
    "merge_data[['Credit_History', 'Property_Area', 'rates']].head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# merge_data[-5:].loc[:,['Credit_History', 'Property_Area', 'rates']]\n",
    "merge_data[['Credit_History', 'Property_Area', 'rates']].tail(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# values is not usefull, only for simple calculate (len), can use other colums instead\n",
    "merge_data.pivot_table(values='Credit_History', index=['Property_Area', 'rates'], aggfunc=len)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. sort dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted_data = data.sort_values(['ApplicantIncome', 'CoapplicantIncome'], ascending=False)\n",
    "sorted_data[['ApplicantIncome', 'CoapplicantIncome']].head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. boxplot and histogram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.boxplot(column='ApplicantIncome', by='Loan_Status')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "data.hist(column='ApplicantIncome', by='Loan_Status', bins=30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. cut bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def binning(col, cut_points, cut_labels):\n",
    "    minval = col.min()\n",
    "    maxval = col.max()\n",
    "    break_points = [minval] + cut_points + [maxval]\n",
    "    return pd.cut(x=col, bins=break_points, labels=cut_labels, include_lowest=True)\n",
    "cut_points = [90, 140, 190]\n",
    "# [20] + cut_points + [220]\n",
    "labels = ['low', 'medium', 'high', 'very high']\n",
    "data['LoanAmount_Bin'] = binning(data['LoanAmount'], cut_points, labels)\n",
    "data[['LoanAmount_Bin', 'LoanAmount']].head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 11. replace (name varibal encoding)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def encoding(col, codedict):\n",
    "    colCoded = pd.Series(col, copy=True)\n",
    "    for k, v in codedict.items():\n",
    "        colCoded.replace(k, v, inplace=True)\n",
    "    return colCoded;\n",
    "\n",
    "data['Loan_Status_Coded'] = encoding(data['Loan_Status'], {'N':0, 'Y':1})\n",
    "data[['Loan_Status', 'Loan_Status_Coded']].head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 12. row iterator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data['Credit_History'].head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# skiprows: Loan_ID\n",
    "data_types = pd.read_csv('./res/datatypes.csv', skiprows=[1])\n",
    "data_types.tail(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i, row in data_types.iterrows():\n",
    "    if row['type'] == 'categorical':\n",
    "        data[row['feature']] = data[row['feature']].astype(np.object)\n",
    "    elif row['type'] == 'continuous':\n",
    "        data[row['feature']] = data[row['feature']].astype(np.float)\n",
    "data.dtypes "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
